R Setup, getting all data organized

TASKS/QUESTIONS:

  • In data:
  • FB = field blank? do we include this?
  • EICE = what is that?
  • Samples ID with EAST is ’South” and WEST is “East”? that was the match up in old data
# Install Packages
# install.packages(c("ggplot2", "ggforce", "forecast", "ggfortify", "ggpubr", "car", "dplyr","rstatix", "coin", "multcomp", "extrafont", "extrafontdb", "ggthemes"))
# library(ggplot2)
library(ggforce)
library(forecast)
library(ggfortify)
library(ggpubr)
library(car)
library(heatmaply)
library(scales)
library(rstatix)
library(ggpubr)
library(coin)
library(multcomp)
library(extrafont)
library(extrafontdb)
library(ggthemes)
library(rcartocolor)
# install.packages("Polychrome")
library(Polychrome)
library(tidyverse)
library(here)

# get data ready
# here::i_am(path = "./Code/SERDP_report_support/CompleteDatasets_rerun.R")

# import new files (summer 2024) -----
d_jba_wat_new<-read.csv(here("./Data/SERDP_report_support/410-165666_JBA_water_data_reformatted.csv"))
d_jba_sed_new<-read.csv(here("./Data/SERDP_report_support/410-173731_JBA_sediment_data_reformatted.csv"))
d_wg_wat_new<-read.csv(here("./Data/SERDP_report_support/410-169030_WG_water_data_reformatted.csv"))

# import old data (pre 2024) -----
d_jba_biota<-read.csv(here("./Data/Original_Brown_etal/JBA/JBA_Biota_Data.csv"))
d_jba_wat<-read.csv(here("./Data/Original_Brown_etal/JBA/JBA_Water_Data.csv"))
d_jba_sed<-read.csv(here("./Data/Original_Brown_etal/JBA/JBA_Sediment_Data.csv"))

d_wg_biota<-read.csv(here("./Data/Original_Brown_etal/WG/WG_Biota_Data.csv"))
d_wg_wat<-read.csv(here("./Data/Original_Brown_etal/WG/WG_Water_Data.csv"))
d_wg_sed<-read.csv(here("./Data/Original_Brown_etal/WG/WG_Sediment_Data.csv"))

# make data long and get the same colnames between two systems - water 
# data needs: long format 

# WG (sed, water, sed) -------
d_wg_wat_noSum <- d_wg_wat %>%
  dplyr::rename(Spp_Loc = Location) %>%
  dplyr::select(SampleID, Spp_Loc, SampleDate, Precipitation,
                PFBA, PFPeA, PFBS, PFHxA, PFPeS,
                PFHpA, PFHxS, PFOA, SixTwoFTS,
                PFHpS, PFNA, PFOS) %>% # 12 PFAS
  pivot_longer(cols = c(PFBA, PFPeA, PFBS, PFHxA, PFPeS,
         PFHpA, PFHxS,PFOA, SixTwoFTS, PFHpS, PFNA, PFOS),
         names_to = "Analyte") %>%
  mutate(Media = "Surface Water",
         SampleDate = strptime(SampleDate, format = "%d-%b-%y"))

d_wg_sed_noSum <- d_wg_sed %>%
  dplyr::rename(Spp_Loc = Location) %>%
  dplyr::select(SampleID, Spp_Loc, SampleDate, PFBA, PFPeA, PFBS, PFHxA, PFPeS,
         PFHpA, PFHxS,PFOA, SixTwoFTS, PFHpS, PFNA, PFOS) %>% # 12 PFAS
  pivot_longer(cols = c(PFBA, PFPeA, PFBS, PFHxA, PFPeS,
         PFHpA, PFHxS,PFOA, SixTwoFTS, PFHpS, PFNA, PFOS),
         names_to = "Analyte") %>%
  mutate(Media = "Sediment",
         SampleDate = strptime(SampleDate, format = "%d-%b-%y"))
d_wg_sed_noSum<-tibble::add_column(d_wg_sed_noSum, Precipitation = NA, .before = "Analyte")

d_wg_biota_noSum <- d_wg_biota %>%
  dplyr::rename(Spp_Loc = Species) %>%
  dplyr::select(SampleID, Spp_Loc, SampleDate, PFBA, PFPeA, PFBS, PFHxA, PFPeS,
         PFHpA, PFHxS,PFOA, SixTwoFTS, PFHpS, PFNA, PFOS) %>% # 12 PFAS
  pivot_longer(cols = c(PFBA, PFPeA, PFBS, PFHxA, PFPeS,
         PFHpA, PFHxS,PFOA, SixTwoFTS, PFHpS, PFNA, PFOS),
         names_to = "Analyte") %>%
  mutate(Media = "Biota",
         SampleDate = strptime(SampleDate, format = "%d-%b-%y"))
d_wg_biota_noSum<-tibble::add_column(d_wg_biota_noSum, Precipitation = NA, .before = "Analyte")

# JBA (sed, wat, biota) -------
d_jba_wat_noSum <- d_jba_wat %>%
  dplyr::rename(Spp_Loc = Location) %>%
  dplyr::select(Spp_Loc, SampleDate, Precipitation, 
                PFBA, PFPrS,  ThreeThreeFTCA ,PFPeA ,PFBS, 
                FourTwoFTS, PFHxA, PFPeS, FiveThreeFTCA, PFHpA,
                ADONA, PFHxS, SixTwoFTS, PFOA, PFHpS,
                SevenThreeFTCA, PFNA, PFOSA, PFOS, PFDA,
                EightTwoFTS , PFNS , MeFOSAA ,EtFOSAA, PFDS, 
                TenTwoFTS, PFDoA , MeFOSA , PFTrDA, PFDoS,
                PFTeDA, PFHxDA) %>% # 32 PFAS
  pivot_longer(cols = c(PFBA, PFPrS,  ThreeThreeFTCA ,PFPeA ,PFBS, 
                FourTwoFTS, PFHxA, PFPeS, FiveThreeFTCA, PFHpA,
                ADONA, PFHxS, SixTwoFTS, PFOA, PFHpS,
                SevenThreeFTCA, PFNA, PFOSA, PFOS, PFDA,
                EightTwoFTS , PFNS , MeFOSAA ,EtFOSAA, PFDS, 
                TenTwoFTS, PFDoA , MeFOSA , PFTrDA, PFDoS,
                PFTeDA, PFHxDA),
         names_to = "Analyte") %>%
  mutate(Media = "Surface Water",
         SampleDate = strptime(SampleDate, format = "%d-%b-%y"))

d_jba_sed_noSum <- d_jba_sed %>%
  dplyr::rename(Spp_Loc = Location,
                SamepleID = Sample.ID) %>%
  dplyr::select(Spp_Loc, SampleDate, 
                PFBA , PFPeA , PFHxA , PFHpA , PFOA ,
                PFNA , PFDA ,PFUnA , PFDoA, PFTrDA,
                PFTeDA, PFBS,PFPeS , PFHxS ,PFHpS,
                PFOS , PFNS,PFDS,PFDoS , FourTwoFTS,
                SixTwoFTS,EightTwoFTS, PFOSA , MeFOSA, MeFOSAA,
                EtFOSAA, ADONA, ThreeThreeFTCA,
                FiveThreeFTCA, SevenThreeFTCA) %>% # 30 PFAS
  pivot_longer(cols = c(PFBA , PFPeA , PFHxA , PFHpA , PFOA ,
                PFNA , PFDA ,PFUnA , PFDoA, PFTrDA,
                PFTeDA, PFBS,PFPeS , PFHxS ,PFHpS,
                PFOS , PFNS,PFDS,PFDoS , FourTwoFTS,
                SixTwoFTS,EightTwoFTS, PFOSA , MeFOSA, MeFOSAA,
                EtFOSAA, ADONA, ThreeThreeFTCA, FiveThreeFTCA, SevenThreeFTCA),
               names_to = "Analyte") %>%
  mutate(Media = "Sediment",
         SampleDate = strptime(SampleDate, format = "%d-%b-%y"))
d_jba_sed_noSum<-tibble::add_column(d_jba_sed_noSum, Precipitation = NA, .before = "Analyte")


# d_jba_biota_noSum <- d_jba_biota %>%
#   dplyr::rename(Spp_Loc = Species) %>%
#   dplyr::select(SampleID, Spp_Loc, SampleDate, 
#                 ) %>% # 12 PFAS
#   pivot_longer(cols = c(PFBA, PFPeA, PFBS, PFHxA, PFPeS,
#          PFHpA, PFHxS,PFOA, `6:2 FTS`, PFHpS, PFNA, PFOS),
#          names_to = "Analyte") %>%
#   mutate(Media = "Biota",
#          SampleDate = strptime(SampleDate, format = "%d-%b-%y"))
# function to wrangle the new data ------
# data<-d_jba_sed_new
# location<-"JBA"
# media<-"Sediment"

new_data_prep<-function(location, data, media){
  
  d_new<-data
  
  if(location == "WG"){
    d_new$Location<-NA
    d_new[which(grepl(pattern = "WEST", d_new$Sample_ID)),"Location"]<-"North"
    d_new[which(grepl(pattern = "EAST", d_new$Sample_ID)),"Location"]<-"South"
    d_new[which(grepl(pattern = "HEAD", d_new$Sample_ID)),"Location"]<-"Head"
    d_new[which(grepl(pattern = "SPILL", d_new$Sample_ID)),"Location"]<-"Spill"
    d_new[which(grepl(pattern = "ROAD", d_new$Sample_ID)),"Location"]<-"Road"
    d_new[which(grepl(pattern = "FB", d_new$Sample_ID)),"Location"]<-"FB"
    d_new[which(grepl(pattern = "EICE", d_new$Sample_ID)),"Location"]<-"EICE"
  }
  if(location == "JBA"){
    d_new$Location<-NA
    d_new[which(grepl(pattern = "RIPP", d_new$Sample_ID)),"Location"]<-"Ripp"
    d_new[which(grepl(pattern = "SPILL", d_new$Sample_ID)),"Location"]<-"Spill"
    d_new[which(grepl(pattern = "EXIT", d_new$Sample_ID)),"Location"]<-"Exit"
    d_new[which(grepl(pattern = "DOWN", d_new$Sample_ID)),"Location"]<-"Down"
  }
  
  # get the PFAS abbreviation
  d_new$Analyte<-sub('.*\\(', '', d_new$Analyte)
  d_new$Analyte<-sub('\\)', '', d_new$Analyte)
  
  # summarize and keep long data format
  d_new <- d_new %>% 
    dplyr::select(Sample_ID, Location, Sampling_Date, Analyte, Result.Value) %>% 
    mutate(Result.Value = as.numeric(Result.Value)) %>% 
    dplyr::rename(SampleID = Sample_ID,
           Spp_Loc = Location, 
           SampleDate = Sampling_Date, 
           value = Result.Value) %>% 
    mutate(Media = media,
          SampleDate = strptime(SampleDate, format = "%m/%d/%Y %H:%M:%S"))
  
  return(d_new)
}

d_jba_wat_2024<-new_data_prep(data = d_jba_wat_new, 
              location = "JBA",
              media = "Surface Water")

d_wg_wat_2024<-new_data_prep(data = d_wg_wat_new, 
              location = "WG",
              media = "Surface Water")
d_wg_wat_2024$Analyte<-factor(d_wg_wat_2024$Analyte)

levels(d_wg_wat_2024$Analyte)[levels(d_wg_wat_2024$Analyte)=="4:2 FTS"] <- "FourTwoFTS"
levels(d_wg_wat_2024$Analyte)[levels(d_wg_wat_2024$Analyte)=="6:2 FTS"] <- "SixTwoFTS"
levels(d_wg_wat_2024$Analyte)[levels(d_wg_wat_2024$Analyte)=="8:2 FTS"] <- "EightTwoFTS"
levels(d_wg_wat_2024$Analyte)[levels(d_wg_wat_2024$Analyte)=="7:3 FTCA"] <- "SevenThreeFTCA"
levels(d_wg_wat_2024$Analyte)[levels(d_wg_wat_2024$Analyte)=="5:3 FTCA"] <- "FiveThreeFTCA"
levels(d_wg_wat_2024$Analyte)[levels(d_wg_wat_2024$Analyte)=="3:3 FTCA"] <- "ThreeThreeFTCA"

d_wg_wat_2024<-tibble::add_column(d_wg_wat_2024, Precipitation = NA, .before = "Analyte")

# scales::show_col(carto_pal(nColor, "Safe"))
# create your own color palette (32 colors) based on `seedcolors`
P50 = createPalette(50,  c("#ff0000", "#00ff00", "#0000ff"))
all_wg_data<-rbind(d_wg_wat_noSum, d_wg_sed_noSum, d_wg_biota_noSum, d_wg_wat_2024)
names(P50)<-levels(all_wg_data$Analyte)
# apply the function to get the data

Figure 2 (Brown et al 2023)

Abbi: This plot should be a bar plot of biota, water, and sediment PFAS percent compositions (Figure 2 in paper)

TASKS/QUESTIONS:

  • Pick a color scheme?
  • Biota - is a specific tissue plotted?
fig2_all<-ggplot(rbind(d_wg_wat_noSum,
                       d_wg_sed_noSum,
                       d_wg_biota_noSum,
                       d_wg_wat_2024),
       aes(x=Media, y=value, fill=Analyte)) + 
  geom_bar(position='fill', width = 0.5, stat='identity') +
  xlab("") +
  scale_fill_manual(values = P50)+
  ylab("PFAS Percent Composition") +
  theme_bw()

fig2_new<-ggplot(rbind(d_wg_wat_2024),
       aes(x=Media, y=value, fill=Analyte)) + 
  geom_bar(position='fill', width = 0.5, stat='identity') +
  xlab("") +
  ylab("PFAS Percent Composition") +
  theme_bw()

fig2_all

# fig2_new

Figure 3 (Brown et al 2023)

Abbi: This plot should be the line chart with the 4 sampling locations and sum PFAS surface water concentrations over time (Figure 3 in the paper). For this one I inserted the second y axis and scaled it to the data (Not sure if you know what I mean, but if you can start with recreating the line plot I can add the second y axis because I can’t find that code right now so i’ll have to re-do it, but here’s code to start!)

TASKS/QUESTIONS:

  • Pick a color scheme?
  • Biota - is a specific tissue plotted?
# get data first summarise sum PFAS by date
p.full<-rbind(d_wg_wat_noSum, d_wg_wat_2024) %>% 
  mutate(SampleID = factor(SampleID)) %>% 
  dplyr::group_by(SampleID, Spp_Loc, Media, SampleDate) %>% 
  dplyr::summarise(SumPFAS = sum(value, na.rm = T)) %>% 
  ggplot(aes(x=as.Date(SampleDate), y=SumPFAS, group=Spp_Loc, color=Spp_Loc)) +
    annotate("rect", xmin = as.Date("2020-10-1"), xmax = as.Date("2020-11-20"),
             ymin = -Inf, ymax = Inf,
             alpha = .1, fill = "black")+
    annotate("rect", xmin = as.Date("2021-01-30"), xmax = as.Date("2021-04-01"),
             ymin = -Inf, ymax = Inf,
             alpha = .1, fill = "black")+
    annotate("rect", xmin = as.Date("2021-06-01"), xmax = as.Date("2021-07-30"),
             ymin = -Inf, ymax = Inf,
             alpha = .1, fill = "black")+# geom_line(alpha = 0.3)+
    geom_point(size=0.5)+
    ylab("Sum PFAS Concentration (ng/L)") +
    xlab("")+
    scale_y_continuous(breaks=c(500,1000,1500,2000,2500, 3000, 3500)) +
    scale_x_date(date_labels = "%b %d", breaks = "2 months")+
    theme_bw()+
    theme(axis.text.x = element_text(angle = 45, hjust=1),
          panel.grid.minor = element_blank())

p.A<-rbind(d_wg_wat_2024) %>%
  mutate(SampleID = factor(SampleID)) %>%
  dplyr::group_by(SampleID, Spp_Loc, Media, SampleDate) %>%
  dplyr::summarise(SumPFAS = sum(value, na.rm = T)) %>%
  filter(SampleDate > as.Date("2021-01-30") & SampleDate < as.Date("2021-04-01")) %>%     ggplot(aes(x=as.Date(SampleDate), y=SumPFAS, group=Spp_Loc, color=Spp_Loc)) +
    geom_line()+
    geom_point(size=0.5)+
    ylab("Sum PFAS Concentration (ng/L)") +
    xlab("")+
    scale_y_continuous(breaks=c(500,1000,1500,2000,2500, 3000, 3500)) +
    scale_x_date(date_labels = "%b %d", breaks = "1 week")+
    theme_bw()+
    theme(axis.text.x = element_text(angle = 45, hjust=1),
          panel.grid.minor = element_blank(),
          legend.position = "none")


p.B<-rbind(d_wg_wat_2024) %>%
  mutate(SampleID = factor(SampleID)) %>%
  dplyr::group_by(SampleID, Spp_Loc, Media, SampleDate) %>%
  dplyr::summarise(SumPFAS = sum(value, na.rm = T)) %>%
  filter(SampleDate > as.Date("2021-06-01") & SampleDate < as.Date("2021-07-30")) %>% 
    ggplot(aes(x=as.Date(SampleDate), y=SumPFAS, group=Spp_Loc, color=Spp_Loc)) +
    geom_line()+
    geom_point(size=0.5)+
    ylab("Sum PFAS Concentration (ng/L)") +
    xlab("")+
    scale_y_continuous(breaks=c(500,1000,1500,2000,2500, 3000, 3500)) +
    scale_x_date(date_labels = "%b %d", breaks = "1 week")+
    theme_bw()+
    theme(axis.text.x = element_text(angle = 45, hjust=1),
          panel.grid.minor = element_blank(),
          legend.position = "none")


p.C<-rbind(d_wg_wat_noSum) %>% 
  mutate(SampleID = factor(SampleID)) %>% 
  dplyr::group_by(SampleID, Spp_Loc, Media, SampleDate) %>% 
  dplyr::summarise(SumPFAS = sum(value, na.rm = T)) %>% 
    ggplot(aes(x=as.Date(SampleDate), y=SumPFAS, group=Spp_Loc, color=Spp_Loc)) +
    geom_line()+
    geom_point(size=0.5)+
    ylab("Sum PFAS Concentration (ng/L)") +
    xlab("")+
    scale_y_continuous(breaks=c(500,1000,1500,2000,2500, 3000, 3500)) +
    scale_x_date(date_labels = "%b %d", breaks = "1 week")+
    theme_bw()+
    theme(axis.text.x = element_text(angle = 45, hjust=1),
          panel.grid.minor = element_blank(),
          legend.position = "none")

row2<-cowplot::plot_grid(p.C, p.A, p.B,  
                   labels = c("B", "C", "D"),
                   nrow = 1)
cowplot::plot_grid(p.full, row2, 
                   labels = c("A", ""),
                   nrow = 2)

Figure 4 (Brown et al 2023)

Abbi: This is for the heatmaps (Figure 4 in the paper). At the time I just added the annotations in the PDF after creating the file, but now I’d do them in R since i’m better at that now haha

TASKS/QUESTIONS:

  • What needs to be excluded (some PFAS are below the detection limit?)
  • No correlation done
df2<-rbind(d_wg_wat_noSum, d_wg_wat_2024) %>% 
  dplyr::group_by(SampleDate, Analyte) %>% 
  dplyr::summarise(meanPFAS = round(mean(value, na.rm = T), 1)) %>%
  mutate(meanPFAS = as.numeric(meanPFAS)) %>%
  pivot_wider(values_from = "meanPFAS", names_from = "Analyte") %>% 
  as.data.frame()

rownames(df2)<-df2$SampleDate
df2<-as.matrix(df2[,-1])

heatmaply(df2,
              scale_fill_gradient_fun = ggplot2::scale_fill_gradientn(
                colours = c("darkgreen","lightgreen","yellow","orange","red","darkred")),
              dendrogram = "none",
              xlab = "", ylab = "",
              main = "",
              scale = "column",
              margins = c(40,80,10,10),
              grid_color = "white",
              grid_width = 0.00001,
              titleX = FALSE,
              hide_colorbar = FALSE,
              cellnote = df2,
              cellnote_textposition = "middle center",
              cellnote_size = 6,
              branches_lwd = 0.1,
              fontsize_row = 10,
              fontsize_col = 10,
              labCol = colnames(df2),
              labRow = rownames(df2),
              heatmap_layers = theme(axis.line=element_blank(),
                                     # axis.text.x=element_blank(),
                                     axis.ticks.x=element_blank(),
                                     axis.text.y=element_text(size = 12,
                                                              color="black",
                                                              family = "Times New Roman"))
              # file = c("./Heatmap.pdf",
              #          height=5, width=4, res=2000),
)

Figure 5 (Brown et al 2023)

Abbi: This plot should be box plots of sediment PFHxS, PFOS, and PFOA concentrations at each of the sampling locations (Figure 5 in paper).

TASKS/QUESTIONS:

  • There is no new sediment and the paper doesn’t have this type of plot for water, we make ‘water plots’ too?
  • No stats done
# no new sediment
phhxs_wg_sed<-rbind(d_wg_sed_noSum) %>% 
  filter(Analyte == "PFHxS") %>% 
  mutate(SampleID = factor(SampleID)) %>% 
  ggplot(aes(Spp_Loc, value))+
    xlab("") +
    ylab("PFAS Concentration (ng/g)") +
    ggtitle("PFHxS in sediment") +
    stat_boxplot(aes(Spp_Loc, value),
                 geom='errorbar', linetype=1, width=0.25)+
    geom_boxplot(aes(Spp_Loc, value),
                 outlier.shape=3,
                 outlier.color = "black",
                 outlier.size = 2.5,
                 color = "black") + 
    stat_summary(fun=mean, geom="point", size=1.5, color = "red") +
    theme_bw() +
    theme(plot.title = element_text(hjust = (0.5), size = 10))

# make water plots?
pfhxs_wg_all<-rbind(d_wg_wat_noSum, d_wg_wat_2024) %>% 
  filter(Analyte == "PFHxS") %>% 
  mutate(SampleID = factor(SampleID)) %>% 
  ggplot(aes(Spp_Loc, value))+
    xlab("") +
    ylab("PFAS Concentration (ng/g)") +
    ggtitle("PFHxS in water") +
    stat_boxplot(aes(Spp_Loc, value),
                 geom='errorbar', linetype=1, width=0.25)+
    geom_boxplot(aes(Spp_Loc, value),
                 outlier.shape=3,
                 outlier.color = "black",
                 outlier.size = 2.5,
                 color = "black") + 
    stat_summary(fun=mean, geom="point", size=1.5, color = "red") +
    theme_bw() +
    theme(plot.title = element_text(hjust = (0.5), size = 10))

phos_wg_all<-rbind(d_wg_wat_noSum, d_wg_wat_2024) %>% 
  filter(Analyte == "PFOS") %>% 
  mutate(SampleID = factor(SampleID)) %>% 
  ggplot(aes(Spp_Loc, value))+
    xlab("") +
    ylab("PFAS Concentration (ng/g)") +
    ggtitle("PFOS in water") +
    stat_boxplot(aes(Spp_Loc, value),
                 geom='errorbar', linetype=1, width=0.25)+
    geom_boxplot(aes(Spp_Loc, value),
                 outlier.shape=3,
                 outlier.color = "black",
                 outlier.size = 2.5,
                 color = "black") + 
    stat_summary(fun=mean, geom="point", size=1.5, color = "red") +
    theme_bw() +
    theme(plot.title = element_text(hjust = (0.5), size = 10))

phoa_wg_all<-rbind(d_wg_wat_noSum, d_wg_wat_2024) %>% 
  filter(Analyte == "PFOA") %>% 
  mutate(SampleID = factor(SampleID)) %>% 
  ggplot(aes(Spp_Loc, value))+
    xlab("") +
    ylab("PFAS Concentration (ng/g)") +
    ggtitle("PFOA in water") +
    stat_boxplot(aes(Spp_Loc, value),
                 geom='errorbar', linetype=1, width=0.25)+
    geom_boxplot(aes(Spp_Loc, value),
                 outlier.shape=3,
                 outlier.color = "black",
                 outlier.size = 2.5,
                 color = "black") + 
    stat_summary(fun=mean, geom="point", size=1.5, color = "red") +
    theme_bw() +
    theme(plot.title = element_text(hjust = (0.5), size = 10))

cowplot::plot_grid(pfhxs_wg_all, phoa_wg_all, phoa_wg_all, 
                   nrow = 1)